package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.HtmlCleaner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.NumberUtils;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;
import com.kdtech.utils.HtmlCleaner;

/**
 * Parser for news pages of Sichuan News Network (newssc.org).
 *
 * <p>The site serves several page layouts, so title, date string and content
 * are extracted by trying a list of CSS selector sets in order until one of
 * them yields a non-empty title.
 *
 * @author KK
 *
 */
public class NewsscNewsAnalyse implements AnalyseNews {

	/**
	 * URL patterns that identify an article (detail) page.
	 *
	 * NOTE(review): the dots in these patterns are not escaped, so they match
	 * any character; left as-is to keep the current matching behavior — confirm
	 * against RegexUtils.matchAny before tightening.
	 */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://(?!magazine)[a-z]*.newssc.org/system/[0-9]{4}/[0-9]{2}/[0-9]{2}/[0-9]*.shtml",
			"http://[a-z]*.newssc.org/system/[0-9]{6,8}/[0-9]*.htm",
			"http://[a-z]*.newssc.org/system/[0-9]{6,8}/[0-9]*.html",
	};

	/**
	 * Known page layouts, tried in order. Each row is
	 * {title selector, date selector, content selector}. A {@code null} date
	 * selector means the layout has no dedicated date element, in which case
	 * the date string found by an earlier layout is kept.
	 */
	private static final String[][] LAYOUT_SELECTORS = {
			{"div.title", "div.info", "div.content"},
			{"table:eq(1)  table:eq(0) > tr:eq(0)  ",
					"body > table > tr > td > table > tr > td > table > tr > td > div",
					"div#news_content"},
			{"td.td_1 h2.f18", "td.co_44", "div#dd,td.f14"},
			{"div#main_left_title", "div#main_left div.txt_center", "div#main_left_zw"},
			{"div.Part_center  div.bsbt", "div.Part_center  div.txt_12px", "div.Part_center  div.txt_zw"},
			{"table#table839 td p.text_zzbt", "table#table839 td div", "table#table839 td.text_zz"},
			{"td#main_left_title", "td.jianjie", "td.content14"},
			{"td.title_b", "td[align=center]", "td.black_h"},
			{"p.zz_bt", "p.end", "td.zz_text"},
			{"div.newstitle", null, "div.content p"},
			{"article h1", null, "article section"},
	};

	/**
	 * Tells whether the given URL looks like a newssc.org article page.
	 *
	 * @param url the URL to test
	 * @return the result of matching {@code url} against the detail-page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, content, date and author from a crawled article page.
	 *
	 * @param urlMeta the crawled page; its HTML and URL are read
	 * @return the populated {@link NewsMeta}, or {@code null} when
	 *         {@code urlMeta} is {@code null} or carries no HTML
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		// Nothing to parse: the original guard had an empty body, which let a
		// null HTML string reach Jsoup.parse and fail there.
		if (urlMeta == null || urlMeta.getHtml() == null) {
			return null;
		}
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();

		// This parser never extracts comment or click counts; they are passed
		// to NumberUtils.parseInt as null, exactly as before.
		String commentNum = null;
		String clickNum = null;

		Document doc = Jsoup.parse(html);
		// Remove the "recommended reading" area so it does not pollute the content.
		doc.select(".yellowuang").remove();

		String title = null;
		String content = null;
		String dateStr = null;
		Long date = null;

		// The first layout is always tried; later ones only while the title is still empty.
		for (int i = 0; i < LAYOUT_SELECTORS.length; i++) {
			if (i > 0 && !StringUtils.isNullOrEmpty(title)) {
				break;
			}
			String[] layout = LAYOUT_SELECTORS[i];
			title = doc.select(layout[0]).text();
			if (layout[1] != null) {
				dateStr = doc.select(layout[1]).text();
			}
			content = HtmlCleaner.getContentHtml(url, doc.select(layout[2]));
		}

		// Last resort: fall back to the <title> element and trim what follows
		// the first line break or, failing that, the last space.
		if (StringUtils.isNullOrEmpty(title)) {
			title = doc.select("title").text();
			if (title != null) {
				if (title.contains("\r\n")) {
					title = StringUtils.substringBefore(title, "\r\n");
				} else {
					title = StringUtils.substringBeforeLast(title, " ");
				}
			}
			date = JSoupUtils.matchDate(doc, "时间:");
			content = HtmlCleaner.getContentHtml(url, doc.select("div.content p"));
		}

		if (StringUtils.isBlank(content)) {
			content = HtmlCleaner.getContentHtml(url, doc.select("td.content14"));
		}
		if (date == null) {
			date = DateUtils.matchDate(dateStr);
		}
		if (date == null) {
			date = JSoupUtils.matchDate(doc, "来源");
		}

		NewsMeta news = new NewsMeta();
		news.setUrl(url);
		news.setType(1); // NOTE(review): meaning of type 1 is defined elsewhere — confirm
		news.setTitle(StringUtils.trimSpace(title));
		news.setCommentNum(NumberUtils.parseInt(commentNum));
		news.setClickNum(NumberUtils.parseInt(clickNum));
		news.setContent(StringUtils.trimSpace(content));
		news.setDate(date);
		news.setAuthor(extractAuthor(doc));
		return news;
	}

	/**
	 * Reads the author/source text, trying each known element in turn and
	 * finally a label-based match on the whole document.
	 */
	private String extractAuthor(Document doc) {
		String author = doc.select("td.jianjie").text();
		if (StringUtils.isBlank(author)) {
			author = doc.select("p.end").text();
		}
		if (StringUtils.isBlank(author)) {
			author = doc.select("span.qzbs").text();
		}
		if (StringUtils.isBlank(author)) {
			author = JSoupUtils.matchAuthor(doc, "来源：", "来源:");
		}
		return author;
	}

	/**
	 * Manual smoke test: fetches one sample article and prints the parse result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		NewsscNewsAnalyse analyser = new NewsscNewsAnalyse();
		String url = "http://scnews.newssc.org/system/20150618/000573855.html";
		System.out.println(analyser.isDetailPage(url));
		UrlMeta meta = CrawlHTML.responseToURL(url);
		NewsMeta parsed = analyser.parserHtml(meta);
		System.out.println(parsed);
	}

	/**
	 * Not implemented for this site.
	 *
	 * @param meta ignored
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate() {
		return false;
	}
}
